## Setup

# clear workspace
rm(list = ls()); gc()

# install older version of stm and tm packages
require(devtools)
remove.packages("stm"); install_version("stm", version = "1.3.6")
remove.packages("tm"); install_version("tm", version = "0.7-8")

# load packages
library(stm)
library(tm)
library(tidyverse)
library(ggplot2)
library(geometry)
library(Rtsne)
library(rsvd)
library(tidystm) # devtools::install_github("mikajoh/tidystm", dependencies = TRUE)

# set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))


# ~~~~~~~~~~


# This script takes about 3.75 hours to run


# ~~~~~~~~~~


## Load and prep preprocessed text data

# load data from data/processed folder
setwd("../data/processed")
data <- readRDS("text_preprocessed.rds")

# add week variable
data$date <- as.Date(data$date, tryFormats = "%m/%d/%Y")
data$week <- as.numeric(strftime(data$date, format = "%V"))


# ~~~~~~~~~~


## Calculate and save the proportion of the corpus with pandemic-related keywords
setwd("../../output")
writeLines(paste("Proportion of ABC and USA Today daily coverage with containing pandemic death-related keywords:", 
                 sum((grepl("covid", data[data$source %in% c("ABC", "USA Today"),]$body) | 
                        grepl("coronavirus", data[data$source %in% c("ABC", "USA Today"),]$body) | 
                        grepl("pandemic", data[data$source %in% c("ABC", "USA Today"),]$body)) & 
                       (grepl("deathly", data[data$source %in% c("ABC", "USA Today"),]$body) | 
                          grepl("death", data[data$source %in% c("ABC", "USA Today"),]$body) | 
                          grepl("deaths", data[data$source %in% c("ABC", "USA Today"),]$body) | 
                          grepl("die", data[data$source %in% c("ABC", "USA Today"),]$body) | 
                          grepl("dies", data[data$source %in% c("ABC", "USA Today"),]$body) | 
                          grepl("fatality", data[data$source %in% c("ABC", "USA Today"),]$body) | 
                          grepl("fatalities", data[data$source %in% c("ABC", "USA Today"),]$body)))/nrow(data[data$source %in% c("ABC", "USA Today"),])), 
           "pg5_corpus_keywords.txt")


# ~~~~~~~~~~


## Preprocess data for topic model

# set the environment variable that makes the strict check a warning instead of an error
# this is necessary for using the older package versions, because R 4.4 introduced a stricter 
# numeric_version() check: comparing a numeric_version object (what packageVersion() returns) 
# to a numeric double (e.g. 0.6) now stops with an error instead of being silently accepted.
Sys.setenv("_R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_" = "false")

# stem words, remove stop words, etc. (note much of this is double-checking what was already done manually)
corpus <- textProcessor(documents = data$body, 
                        metadata = data,
                        lowercase = T,            # makes everything lowercase
                        removestopwords = T,      # removes common stopwords
                        removenumbers = T,        # removes numbers
                        removepunctuation = T,    # removes punctuation
                        stem = T,                 # stems words
                        wordLengths = c(3, Inf),  # removes all words less than 3 characters
                        sparselevel = 1,          # default setting
                        language = "en", 
                        verbose = F,
                        customstopwords = c("lastworddesk", "last word desk", "trms", "donnel", "ggutfeld", "foxnewspodcast", 
                                            "thaf", "whaf", "thafs", "whafs", "donnell", "gonna", "wanna", 
                                            "americastrong", "jasoninthehouse", "usatoday", "donnell"))

# transform corpus into necessary format and remove words in fewer than 0.5% of transcripts
documents <- prepDocuments(documents = corpus$documents,
                           vocab = corpus$vocab,
                           meta = corpus$meta, 
                           lower.thres = round(length(corpus$documents)*.005,0))


# ~~~~~~~~~~


## Investigate optimal number of topics

# fit topic model with values of k: 8, 10, 12, 15, 20, 25, 30, 40, 50, 60, 75 (takes about 2.75 hours)
ks <- searchK(documents = documents$documents, 
              vocab = documents$vocab, 
              K = c(8,10,12,15,20,25,30,40,50,60,75),
              prevalence = ~source,
              data = documents$meta)

# plot overall exclusivity vs. semantic coherence trade-off
p <- ks$results %>%
  ggplot(aes(x = as.numeric(semcoh), y = as.numeric(exclus))) + 
  geom_text(aes(label = K), hjust = 0, vjust = 0) + 
  geom_smooth(se = F) + 
  theme_bw() + 
  labs(x = "Semantic Coherence", y = "Exclusivity")

# save
ggsave(filename = "Figure A.2.png", plot = p, device = "png", width = 15, height = 10, units = "in", bg = "white")


## Run topic models for k in (15, 20, 25, 30) (takes about 45 minutes)

# k = 15
model_15 <- stm(documents = documents$documents, vocab = documents$vocab, K = 15, 
                prevalence = ~source, data = documents$meta, init.type = "Spectral")

# k = 20
model_20 <- stm(documents = documents$documents, vocab = documents$vocab, K = 20,
                prevalence = ~source, data = documents$meta, init.type = "Spectral")

# k = 25
model_25 <- stm(documents = documents$documents, vocab = documents$vocab, K = 25,
                prevalence = ~source, data = documents$meta, init.type = "Spectral")

# k = 30
model_30 <- stm(documents = documents$documents, vocab = documents$vocab, K = 30,
                prevalence = ~source, data = documents$meta, init.type = "Spectral")


# plot exclusivity vs. semantic coherence trade-off disaggregated by topic for k in (15, 20, 25, 30)
ex_15 <- exclusivity(model_15)
sc_15 <- semanticCoherence(model = model_15, documents = documents$documents)
ex_20 <- exclusivity(model_20)
sc_20 <- semanticCoherence(model = model_20, documents = documents$documents)
ex_25 <- exclusivity(model_25)
sc_25 <- semanticCoherence(model = model_25, documents = documents$documents)
ex_30 <- exclusivity(model_30)
sc_30 <- semanticCoherence(model = model_30, documents = documents$documents)

toplot <- as.data.frame(rbind(cbind(1:15, ex_15, sc_15, rep("Model 15", 15)),
                              cbind(1:20, ex_20, sc_20, rep("Model 20", 20)),
                              cbind(1:25, ex_25, sc_25, rep("Model 25", 25)),
                              cbind(1:30, ex_30, sc_30, rep("Model 30", 30))))
colnames(toplot) <- c("K", "Exclusivity", "Semantic Coherence", "Model")
toplot$Exclusivity <- as.numeric(toplot$Exclusivity)
toplot$`Semantic Coherence` <- as.numeric(toplot$`Semantic Coherence`)

p <- ggplot(data = toplot, aes(x = `Semantic Coherence`, y = Exclusivity, color = Model)) + 
  geom_point(size = 2, alpha = 0.7) + 
  geom_text(aes(label = K), nudge_x = 0, nudge_y = .05) + 
  theme_bw()

# save
ggsave(filename = "Figure A.3.png", plot = p, device = "png", width = 15, height = 10, units = "in", bg = "white")


# ~~~~~~~~~~


## Explore k = 30

# examine associated words
labs <- labelTopics(model_30, n = 10)

# Topic 1 is George Floyd
# Topic 2 is Democratic presidential primary
# Topic 3 is schooling
# Topic 4 is year-end
# Topic 5 is terrorism
# Topic 6 is Hunter Biden
# Topic 7 is vote-by-mail
# Topic 8 is impeachment 1
# Topic 9 is small talk
# Topic 10 is general election
# Topic 11 is Russia investigation
# Topic 12 is civil rights
# Topic 13 is COVID economy
# Topic 14 is COVID 1
# Topic 15 is miscellanea 1
# Topic 16 is sexual crimes
# Topic 17 is Trump diagnosis
# Topic 18 is COVID 2
# Topic 19 is impeachment 2
# Topic 20 is SCOTUS
# Topic 21 is COVID 3
# Topic 22 is Kenosha
# Topic 23 is Taylor/Arbery
# Topic 24 is COVID 4
# Topic 25 is Roger Stone
# Topic 26 is weather
# Topic 27 is Fox targets
# Topic 28 is BLM
# Topic 29 is disasters
# Topic 30 is miscellanea 2

# create labels
labs_30 <- c("George Floyd", "Dem primary", "schooling", "year-end", "terrorism", "Hunter Biden", "vote-by-mail", "impeachment 1", "small talk", 
             "general election", "Russia", "civil rights", "COVID economy", "COVID 1", "miscellanea 1", "sex crimes", "Trump diagnosis", "COVID 2", 
             "impeachment 2", "SCOTUS", "COVID 3", "Kenosha", "Taylor/Arbery", "COVID 4", "Roger Stone", "weather", "Fox targets", "BLM", "disasters", 
             "miscellanea 2")

# visualize and save topic prevalence
ggsave(filename = "Figure A.4.png", plot = plot.STM(x = model_30, type = "summary", custom.labels = labs_30, main = ""), 
       device = "png", width = 10, height = 15, units = "in", bg = "white")

# store top-10 highest probability, FREX, and lift for the COVID topics for Appendix A.5
tab_a5 <- as.data.frame(rbind(labs$prob[14,], labs$frex[14,], labs$lift[14,],
                              labs$prob[18,], labs$frex[18,], labs$lift[18,],
                              labs$prob[21,], labs$frex[21,], labs$lift[21,],
                              labs$prob[24,], labs$frex[24,], labs$lift[24,])) %>%
  mutate(model = c(rep("Topic 14", 3), rep("Topic 18", 3), rep("Topic 21", 3), rep("Topic 24", 3)),
         type = rep(c("Highest Prob", "FREX", "Lift"), 4))

# set working directory and save
setwd("../../output")
write.csv(tab_a5, "Appendix A.5.csv", row.names = F)


# ~~~~~~~~~~


## Extract and save aggregate COVID prevalence

# extract source-by-week estimates
eff <- estimateEffect(formula = ~s(week) + source + s(week):source, 
                      stmobj = model_30, 
                      metadata = documents$meta)

# extract results in data frame
results <- rbind(extract.estimateEffect(x = eff,
                                        covariate = "week",
                                        model = model_30, 
                                        method = "pointestimate",
                                        moderator = "source", 
                                        moderator.value = "Fox"), 
                 extract.estimateEffect(x = eff, 
                                        covariate = "week",
                                        model = model_30, 
                                        method = "pointestimate",
                                        moderator = "source", 
                                        moderator.value = "MSNBC"), 
                 extract.estimateEffect(x = eff, 
                                        covariate = "week",
                                        model = model_30, 
                                        method = "pointestimate",
                                        moderator = "source", 
                                        moderator.value = "ABC"), 
                 extract.estimateEffect(x = eff,
                                        covariate = "week",
                                        model = model_30, 
                                        method = "pointestimate",
                                        moderator = "source", 
                                        moderator.value = "USA Today")) %>%
  select(-c(method, covariate, ci.level, moderator, label)) %>%
  rename(week = covariate.value, source = moderator.value)

# collapse COVID topics into source-by-week estimates
results <- results %>%
  filter(topic %in% c(14, 18, 21, 24)) %>%
  group_by(source, week) %>%
  summarize(estimate = sum(estimate))

# visualize aggregate COVID-19 topic prevalence by week on ABC and USA Today
p <- results %>%
  filter(source %in% c("ABC", "USA Today")) %>%
  ggplot(aes(x = week, y = estimate, color = source)) + 
  geom_line(linewidth = 1.5) + 
  theme_bw() + 
  labs(x = "", y = "Expected Topic Proportion", color = "") + 
  theme(legend.position = "bottom",
        text=element_text(size=18)) + 
  scale_x_continuous(breaks = c(1, 31/7, 59/7, 90/7, 120/7, 151/7, 181/7, 212/7, 243/7, 273/7, 304/7, 334/7), 
                     labels = c("Jan", "Feb", "Mar", "Apr", "May", "June", "July", "Aug", "Sept", "Oct", "Nov", "Dec"))

# save Figure 2
ggsave(filename = "Figure 2.png", plot = p, device = "png", width = 15, height = 10, units = "in", bg = "white")

# set working directory and save topic model
setwd("../data/processed")
saveRDS(results, "topic_model.rds")

# calculate and save correlation between ABC and USA Today topic prevalence | number in main text (pg. 12)
setwd("../../output")
writeLines(paste("Correlation between USA Today and ABC:", round(cor(results[results$source == "ABC",]$estimate, results[results$source == "USA Today",]$estimate),3)), "pg12_correlation_topics.txt")


# ~~~~~~~~~~


## Calculate correlation between STM and hand-coding for footnote 9

# load hand-coding
setwd("../data/raw")
hand_coding <- read.csv("hand_coding.csv")
hand_coding <- subset(hand_coding, select = c(week, attention_fox, attention_msnbc))

# calculate and save correlations
setwd("../../output")
writeLines(paste(paste("Correlation between Fox and STM:", 
                       round(cor(results[results$source == "Fox" & results$week %in% hand_coding$week,]$estimate, hand_coding$attention_fox, "pairwise.complete.obs"),3)), 
                 paste("Correlation between MSNBC and STM:", 
                       round(cor(results[results$source == "MSNBC" & results$week %in% hand_coding$week,]$estimate, hand_coding$attention_msnbc, "pairwise.complete.obs"),3)), sep = "\n"), "footnote_9.txt")

